import numpy as np
from auxiliarymethods import datasets as dp
import auxiliarymethods.auxiliary_methods as aux
from sklearn.metrics.cluster import normalized_mutual_info_score
from matplotlib import pyplot as plt
from auxiliarymethods import reader
import pandas as pd
from sources import graph_analysis,clustering,dimensionality_reduction,outlier_detection, visualization, utility_functions
import networkx as nx
import seaborn as sns
Load the IMDB network:
To limit the sheer amount of exploratory data analysis that could be done on this dataset, we developed the following three questions, which we will try to explore and answer:
Before starting with the first question, we load the data and do some basic graph analysis/plot the most important features to get some first impression of the imdb dataset and the different kernel representation.
# Parse the raw TU-Dortmund IMDB-BINARY dump into a collection of networkx
# graphs (indexed/sliced below; each element is passed to nx.density etc.).
imdb_networkx = reader.tud_to_networkx("IMDB-BINARY")
Load each dataset and save it in variables:
# Ground-truth class labels (romance vs. action) for the 1000 ego-networks.
classes = dp.get_dataset("IMDB-BINARY")

# Feature-vector representations: load each sparse matrix from disk, then
# normalise it.  WL1..WL5 are Weisfeiler-Lehman kernels of increasing depth.
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_wl1.npz")
imdb_wl1_vectors = aux.normalize_feature_vector(raw)
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_wl2.npz")
imdb_wl2_vectors = aux.normalize_feature_vector(raw)
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_wl3.npz")
imdb_wl3_vectors = aux.normalize_feature_vector(raw)
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_wl4.npz")
imdb_wl4_vectors = aux.normalize_feature_vector(raw)
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_wl5.npz")
imdb_wl5_vectors = aux.normalize_feature_vector(raw)
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_graphlet.npz")
imdb_graphlet_vectors = aux.normalize_feature_vector(raw)
raw = utility_functions.load_sparse("../../graph_representations/without_labels/IMDB-BINARY_vectors_shortestpath.npz")
imdb_shortestpath_vectors = aux.normalize_feature_vector(raw)

# Gram-matrix (kernel) representations: load each CSV, then normalise it.
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_wl1.csv")
imdb_wl1_gram = aux.normalize_gram_matrix(raw)
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_wl2.csv")
imdb_wl2_gram = aux.normalize_gram_matrix(raw)
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_wl3.csv")
imdb_wl3_gram = aux.normalize_gram_matrix(raw)
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_wl4.csv")
imdb_wl4_gram = aux.normalize_gram_matrix(raw)
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_wl5.csv")
imdb_wl5_gram = aux.normalize_gram_matrix(raw)
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_graphlet.csv")
imdb_graphlet_gram = aux.normalize_gram_matrix(raw)
raw = utility_functions.load_csv("../../graph_representations/without_labels/IMDB-BINARY_gram_matrix_shortestpath.csv")
imdb_shortestpath_gram = aux.normalize_gram_matrix(raw)
Applying dimensionality reduction on the loaded data, to reduce the data to the most necessary features. Here we reduce each of the datasets to 100 components.
# Kernel PCA (100 components) on each normalised gram matrix.
kpca_wl1_gram = dimensionality_reduction.kernelPCA(imdb_wl1_gram, 100)
kpca_wl2_gram = dimensionality_reduction.kernelPCA(imdb_wl2_gram, 100)
kpca_wl3_gram = dimensionality_reduction.kernelPCA(imdb_wl3_gram, 100)
kpca_wl4_gram = dimensionality_reduction.kernelPCA(imdb_wl4_gram, 100)
kpca_wl5_gram = dimensionality_reduction.kernelPCA(imdb_wl5_gram, 100)
kpca_graphlet_gram = dimensionality_reduction.kernelPCA(imdb_graphlet_gram, 100)
kpca_shortestpath_gram = dimensionality_reduction.kernelPCA(imdb_shortestpath_gram, 100)
# The five WL depths grouped for convenience (gram-based embeddings).
wl_listG = [kpca_wl1_gram, kpca_wl2_gram, kpca_wl3_gram, kpca_wl4_gram, kpca_wl5_gram]

# Truncated SVD (100 components) on each normalised feature-vector set.
tsvd_wl1_vectors = dimensionality_reduction.truncatedSVD(imdb_wl1_vectors, 100)
tsvd_wl2_vectors = dimensionality_reduction.truncatedSVD(imdb_wl2_vectors, 100)
tsvd_wl3_vectors = dimensionality_reduction.truncatedSVD(imdb_wl3_vectors, 100)
tsvd_wl4_vectors = dimensionality_reduction.truncatedSVD(imdb_wl4_vectors, 100)
tsvd_wl5_vectors = dimensionality_reduction.truncatedSVD(imdb_wl5_vectors, 100)
tsvd_graphlet_vectors = dimensionality_reduction.truncatedSVD(imdb_graphlet_vectors, 100)
tsvd_shortestpath_vectors = dimensionality_reduction.truncatedSVD(imdb_shortestpath_vectors, 100)
# The five WL depths grouped for convenience (vector-based embeddings).
wl_listV = [tsvd_wl1_vectors, tsvd_wl2_vectors, tsvd_wl3_vectors, tsvd_wl4_vectors, tsvd_wl5_vectors]
The svd_energy() function returns the number of features that are necessary to retain at least 90% of the energy. Here we show an example on the Weisfeiler-Lehman 1 dataset:
# Reduce WL1 to 100 components, then keep only as many components as are
# needed to retain 90% of the energy.
tsvd_wl1_vectors_energy = dimensionality_reduction.truncatedSVD(imdb_wl1_vectors, 100)
nrComponentsNeededToObtainEnergy = dimensionality_reduction.svd_energy(imdb_wl1_vectors, 100)
print("The amount of features that should remain to obtain 90% energy: ", nrComponentsNeededToObtainEnergy)
# BUG FIX: the original deleted range(n-1, 99), which dropped column n-1 (a
# component that should remain) and kept column 99 (a redundant one).
# Deleting range(n, 100) removes exactly the trailing 100-n columns, leaving
# the first n components.
tsvd_wl1_vectors_energy = np.delete(tsvd_wl1_vectors_energy, range(nrComponentsNeededToObtainEnergy, 100), 1)
print("The shape of the data after deleting the redundant columns ", tsvd_wl1_vectors_energy.shape)
# Split the 1000 ego-networks into the two genre classes: the first 500 are
# romance, the remaining 500 are action.
graphs_romance = imdb_networkx[0:500]
# FIX: the upper bound was 1001, one past the end of the 1000-graph list.
# Python slicing clamps it silently, so behaviour is unchanged, but 1000
# states the intent correctly.
graphs_action = imdb_networkx[500:1000]
# Summary statistics for the whole dataset and for each genre subset.
graph_analysis.getGraphDataByClass(imdb_networkx, 'Entire Dataset')
print()
graph_analysis.getGraphDataByClass(graphs_romance, 'ROMANCE')
print()
graph_analysis.getGraphDataByClass(graphs_action, 'ACTION')
Here we collected some important information about the graphs. In the first block we look at the entire dataset-network, then at the romance-graphs and at the bottom at the action-graphs. We discovered two interesting things:
The average density for romance-graphs is 7.7% higher (compared to the action-graphs) and the romance-graphs have more isomorphic graph-pairs (also compared to the action-graphs).
The other statistics (average number of edges, nodes and edges per node) do not differ between the two genre-subsets to a notable extent.
# Show one representative ego-network per genre, then the extreme graphs and
# an overview plot of every representation.
example_graphs = [
    ('Example graph for genre romance', graphs_romance),
    ('Example graph for genre action', graphs_action),
]
for position, (caption, genre_subset) in enumerate(example_graphs):
    if position:
        print()
    print(caption)
    visualization.visualize(genre_subset[0])
graph_analysis.getExtremeGraphs(imdb_networkx)
visualization.showEntireDataset(wl_listG, wl_listV, tsvd_graphlet_vectors, kpca_graphlet_gram, tsvd_shortestpath_vectors, kpca_shortestpath_gram, classes)
After analysing the different plots, we noticed the following things:
Note: Since we have now shown that the shapes of the Weisfeiler-Lehman datasets are rather similar, we will only look at the dataset "Weisfeiler-Lehman 5" in further data analysis.
To answer this question, we first need to define thresholds to remove outliers from each representation. To do this, we created boxplots and then set the thresholds accordingly. In the code below we plot one boxplot each for the first two dimensions of the Weisfeiler-Lehman dataset, to help us determine the ranges that classify datapoints as outliers for that specific dataset. After the boxplots, we visualize 3 datasets: the "normal" Weisfeiler-Lehman, Weisfeiler-Lehman without outliers, and the outliers of the dataset:
# --- Outlier handling for the WL5 kernel embedding ---
# Boxplots of the leading components guide the hand-picked thresholds below.
visualization.showBoxplot(kpca_wl5_gram, "Weisfeiler-Lehman 5 Boxplot")
visualization.scatterPlot2DBig(kpca_wl5_gram, "Weisfeiler-Lehman 5", classes)
# Keep only points inside the threshold box; also returns a per-row outlier
# index reused by later cells.
# NOTE(review): the four numeric arguments are assumed to be
# x_min, x_max, y_min, y_max — confirm against seperateOutliersWithRange.
wl5WithoutOutliers, wl5outlierIndex = outlier_detection.seperateOutliersWithRange(kpca_wl5_gram, -0.3, 0.4, -0.4, 0.4, returnOutliers=False)
visualization.scatterPlot2DBig(wl5WithoutOutliers, "Weisfeiler-Lehman 5 without outliers", classes)
# Same thresholds, but this time keep only the flagged outliers.
wl5OnlyOutliers, wl5outlierIndex = outlier_detection.seperateOutliersWithRange(kpca_wl5_gram, -0.3, 0.4, -0.4, 0.4, returnOutliers=True)
visualization.scatterPlot2DBig(wl5OnlyOutliers, "Weisfeiler-Lehman 5 only outliers", classes)
outlier_detection.printOutlierCount(wl5outlierIndex, "Analyzing the Weisfeiler-Lehman 5 outliers")
In the code below we plot one boxplot each for the first two dimensions of the Shortest Path dataset, to help us determine the ranges that classify datapoints as outliers for that specific dataset. After the boxplots, we visualize 3 datasets: the "normal" Shortest Path, Shortest Path without outliers, and the outliers of the dataset:
# --- Outlier handling for the Shortest Path kernel embedding ---
visualization.showBoxplot(kpca_shortestpath_gram, "Shortest path boxplot")
# FIX: captions previously read "Shorted Path" (typo) and the outlier-only
# plot was mislabelled "without outliers"; each caption now matches its plot.
visualization.scatterPlot2DBig(kpca_shortestpath_gram, "Shortest Path", classes)
# Keep points inside the hand-picked threshold box; the index array marks
# which rows were flagged as outliers.
shortestPathWithoutOutliers, shortestPathOutlierIndex = outlier_detection.seperateOutliersWithRange(kpca_shortestpath_gram, -0.6, 0.55, -0.2, 0.175, returnOutliers=False)
visualization.scatterPlot2DBig(shortestPathWithoutOutliers, "Shortest Path without outliers", classes)
# Same thresholds, but return only the flagged outliers.
shortestPathOnlyOutliers, shortestPathOutlierIndex = outlier_detection.seperateOutliersWithRange(kpca_shortestpath_gram, -0.6, 0.55, -0.2, 0.175, returnOutliers=True)
visualization.scatterPlot2DBig(shortestPathOnlyOutliers, "Shortest Path only outliers", classes)
outlier_detection.printOutlierCount(shortestPathOutlierIndex, "Analyzing the Shortest Path outliers")
In the code below we plot one boxplot each for the first two dimensions of the Graphlet dataset, to help us determine the ranges that classify datapoints as outliers for that specific dataset. After the boxplots, we visualize 3 datasets: the "normal" Graphlet, Graphlet without outliers, and the outliers of the dataset:
# --- Outlier handling for the Graphlet kernel embedding ---
visualization.showBoxplot(kpca_graphlet_gram, "Graphlet boxplot")
# FIX: all three scatter-plot captions were copy-pasted from the Shortest
# Path cell ("Shorted Path ..."); they now name the Graphlet dataset and
# state whether outliers are included.
visualization.scatterPlot2DBig(kpca_graphlet_gram, "Graphlet", classes)
# Keep points inside the hand-picked threshold box; the index array marks
# which rows were flagged as outliers.
graphletWithoutOutliers, graphletOultiersIndex = outlier_detection.seperateOutliersWithRange(kpca_graphlet_gram, -0.6, 0.66, -0.2, 0.22, returnOutliers=False)
visualization.scatterPlot2DBig(graphletWithoutOutliers, "Graphlet without outliers", classes)
# Same thresholds, but return only the flagged outliers.
graphletOnlyOutliers, graphletOultiersIndex = outlier_detection.seperateOutliersWithRange(kpca_graphlet_gram, -0.6, 0.66, -0.2, 0.22, returnOutliers=True)
visualization.scatterPlot2DBig(graphletOnlyOutliers, "Graphlet only outliers", classes)
outlier_detection.printOutlierCount(graphletOultiersIndex, "Analyzing the Graphlet outliers")
As you have probably noticed, the number of outliers you see in the visualizations above is far less than the actual number of outliers! That is because what we defined as outliers are stacked on top of each other. The best example of this is the Weisfeiler-Lehman 5 outlier plot above, where you can only see three data points, but these are actually 103 data points.
Note: We are sticking to the outliers we have found, because they are separated from the rest of the data. The problem here is that clustering will be difficult if several outliers are really close to each other. We will handle this by looking at datasets without the outliers.
# A graph is a "shared" outlier only if ALL THREE representations flag it.
# FIX: the flag arrays were sized with a hard-coded 1000 while the loop used
# len(imdb_networkx); derive both from the data so they cannot diverge.
n_graphs = len(imdb_networkx)
sharedOutliers = [0] * n_graphs
sharedOutliersBool = [False] * n_graphs
for i in range(n_graphs):
    if wl5outlierIndex[i] == 1 and shortestPathOutlierIndex[i] == 1 and graphletOultiersIndex[i] == 1:
        sharedOutliers[i] = 1
        sharedOutliersBool[i] = True
outlier_detection.printOutlierCount(sharedOutliers, "SHARED OUTLIERS")
# Partition the ego-networks by the shared-outlier flag computed above.
imbd_networkWithoutOutliers = []
imbd_networkOnlyOutliers = []
for flag, graph in zip(sharedOutliers, imdb_networkx):
    bucket = imbd_networkOnlyOutliers if flag == 1 else imbd_networkWithoutOutliers
    bucket.append(graph)
# Compare summary statistics of the two partitions.
print()
graph_analysis.getGraphDataByClass(imbd_networkWithoutOutliers,'IMDB Network without outliers')
print()
graph_analysis.getGraphDataByClass(imbd_networkOnlyOutliers, 'IMDB Network only outliers')
If you look at the outliers analysis above, you see that they differ in every detail, in comparison to the network without outliers. Interesting to see is that the average density of the outlier ego-networks is 1. Therefore every node in any outlier ego-network is connected to every other node in the same ego-network.
If you calculate $\binom{48}{2}$, you obtain the number of pairs of outlier ego-networks. The result (=1128) equals the number of isomorphic pairs among the outlier ego-networks. This proves that the outlier ego-networks are all the same.
# Render every shared-outlier ego-network for visual inspection.
for outlier_graph in imbd_networkOnlyOutliers:
    visualization.visualize(outlier_graph)
For this, we created the following three features:
We concatenate each feature separately to the existing kernel representation of Weisfeiler-Lehman 5 and plot the results below compared to the original dataset in 2D and 3D.
# Low-dimensional WL5 embeddings used as the base for the extra-feature plots.
kpca_wl5_gram2 = dimensionality_reduction.kernelPCA(imdb_wl5_gram, 2)
kpca_wl5_gram3 = dimensionality_reduction.kernelPCA(imdb_wl5_gram, 3)
# Per-graph structural features, reshaped to column vectors so they can be
# concatenated to the embeddings.
# FIX: reshape to (-1, 1) instead of the hard-coded (1000, 1) so the code
# does not break if the dataset size changes.
allDensities = [nx.density(graph) for graph in imdb_networkx]
allDensities = np.reshape(allDensities, (-1, 1))
allEdges = [nx.number_of_edges(graph) for graph in imdb_networkx]
allEdges = np.reshape(allEdges, (-1, 1))
allNodes = [nx.number_of_nodes(graph) for graph in imdb_networkx]
allNodes = np.reshape(allNodes, (-1, 1))
# Plot the 3-D embedding with each extra feature as a colour/4th column.
visualization.plotWithExtraColumn(kpca_wl5_gram3, allDensities, classes, "Density")
visualization.plotWithExtraColumn(kpca_wl5_gram3, allEdges, classes, "Number of edges")
visualization.plotWithExtraColumn(kpca_wl5_gram3, allNodes, classes, "Number of nodes")
What we wanted to achieve here is to see if an extra feature increases the correctness of the clustering or not. Therefore we decided to make several versions of the Weisfeiler-Lehman dataset:
# Build the four datasets whose clusterings are compared below:
#   1) full WL5 embedding (100 components)           -> baseline NMI (L146)
#   2) WL5 without outliers                          -> wl5LabelsReduced
#   3) WL5 without outliers + density column         -> wl5ExtraFeatureReducesLabels
#   4) full 2-D WL5 + density column                 -> wl5ExtraFeatureLabels
kpca_wl5_gram = dimensionality_reduction.kernelPCA(imdb_wl5_gram, 100)
# Drop rows whose first entry is None.
# NOTE(review): `arr == None` is only an elementwise comparison when the
# array has object dtype (i.e. the outlier-removal step blanks rows with
# None) — confirm against seperateOutliersWithRange/removeOutliers.
wl5WithoutOutliers = np.delete(wl5WithoutOutliers, np.where(wl5WithoutOutliers[:,0] == None)[0],0)
# k-means (k=2) on the first three components of the outlier-free data.
wl5LabelsReduced = clustering.kMeans(wl5WithoutOutliers[:,0:3],2)
# 2-D WL5 embedding with graph density appended as a third feature, then
# outliers removed and None rows dropped before clustering.
wl5ExtraFeatureReducesLabels = np.append(kpca_wl5_gram2, allDensities, axis=1)
wl5ExtraFeatureReducesLabels = outlier_detection.removeOutliers(wl5ExtraFeatureReducesLabels, wl5outlierIndex)
wl5ExtraFeatureReducesLabels = np.delete(wl5ExtraFeatureReducesLabels, np.where(wl5ExtraFeatureReducesLabels[:,0] == None)[0],0)
wl5ExtraFeatureReducesLabels = clustering.kMeans(wl5ExtraFeatureReducesLabels[:,0:3],2)
# Ground-truth labels restricted to the same non-outlier rows, for fair NMI.
classesWl5Reduces = outlier_detection.removeOutliers(classes, wl5outlierIndex)
classesWl5Reduces = np.delete(classesWl5Reduces, np.where(classesWl5Reduces == None)[0],0)
# Variant 4: density column appended, but outliers kept.
wl5ExtraFeatureLabels = np.append(kpca_wl5_gram2, allDensities, axis=1)
wl5ExtraFeatureLabels = clustering.kMeans(wl5ExtraFeatureLabels,2)
# Baseline: ground truth vs. kMeans(k=2) on the first 10 WL5 components.
print(normalized_mutual_info_score(classes, clustering.kMeans(kpca_wl5_gram[:,0:10],2)))
0.0239 is the NMI of the ground truth labels and the kMeans of the original dataset. This result is not optimal. Let's see what the NMI is when comparing the ground truth with the dataset without outliers:
# NMI: ground truth (outliers removed) vs. k-means labels on WL5 without outliers.
print(normalized_mutual_info_score(classesWl5Reduces, wl5LabelsReduced))
The NMI got smaller. Therefore, we know that clustering without the outliers gives a worse result.
Let's look at the dataset without outliers and the density as an extra feature attached:
# NMI: ground truth (outliers removed) vs. WL5 without outliers + density feature.
print(normalized_mutual_info_score(classesWl5Reduces, wl5ExtraFeatureReducesLabels))
The NMI is now between the two values from above. So attaching the density helps to cluster better:
# NMI: full ground truth vs. full 2-D WL5 + density feature (outliers kept).
print(normalized_mutual_info_score(classes, wl5ExtraFeatureLabels))
This NMI is the worst from all NMIs' above. To conclude you can say, that attaching the density as an extra column only improves the NMI, when the outliers are ignored. If you apply it on the whole dataset, the NMI decreases a lot.
We consider the representations WL5, graphlet and shortest path with only the shared outliers removed, so each dataset has the same length. We then apply KMeans with different n_cluster parameters to evaluate the loss and find a common n_cluster size. The scree plots for these 3 datasets can be seen below.
# Drop None rows from the per-representation outlier-free data for the
# scree plots.  NOTE(review): `arr == None` is elementwise only for object
# dtype arrays — confirm what seperateOutliersWithRange returns.
wl5WithoutOutliersReduced = np.delete(wl5WithoutOutliers, np.where(wl5WithoutOutliers[:,0] == None)[0],0)
shortestPathWithoutOutliersReduced = np.delete(shortestPathWithoutOutliers, np.where(shortestPathWithoutOutliers[:,0] == None)[0],0)
graphletWithoutOutliersReduced = np.delete(graphletWithoutOutliers, np.where(graphletWithoutOutliers[:,0] == None)[0],0)
clustering.kMeans_scree_plot(wl5WithoutOutliersReduced)
clustering.kMeans_scree_plot(shortestPathWithoutOutliersReduced)
clustering.kMeans_scree_plot(graphletWithoutOutliersReduced)
# Rebuild the three datasets with ONLY the shared outliers removed so all
# three have the same length for pairwise NMI comparison.
# BUG FIX: sharedOutliersBool is a plain Python list, so the original
# `sharedOutliersBool == True` compared the whole list to True (-> False),
# and np.where(False) produced an empty index set — no shared outlier was
# actually removed.  Converting to an ndarray makes the test elementwise.
sharedOutlierRows = np.where(np.asarray(sharedOutliersBool))[0]
wl5WithoutOutliersReduced = np.delete(kpca_wl5_gram, sharedOutlierRows, 0)
graphletWithoutOutliersReduced = np.delete(kpca_graphlet_gram, sharedOutlierRows, 0)
shortestPathWithoutOutliersReduced = np.delete(kpca_shortestpath_gram, sharedOutlierRows, 0)
From the scree plots you can see that the elbow is where the biggest kink in the curve is. For Weisfeiler-Lehman it is 8 and for the two other datasets it is 2. Therefore we perform KMeans with the given cluster sizes (both 2 and 8) and use the NMI to evaluate the pairwise results:
from sklearn.metrics import normalized_mutual_info_score
# Run the identical clustering + pairplot + pairwise-NMI pipeline once per
# candidate cluster count (the two elbow candidates from the scree plots).
# The loop reproduces the original duplicated cells in the same output order;
# after the loop n_clusters and the *_labels variables hold the k=8 results,
# exactly as before.
for n_clusters in (2, 8):
    wl5WithoutOutliersReduced_labels = clustering.kMeans(wl5WithoutOutliersReduced[:, 0:10], n_clusters)
    graphletWithoutOutliersReduced_labels = clustering.kMeans(graphletWithoutOutliersReduced[:, 0:10], n_clusters)
    shortestPathWithoutOutliersReduced_labels = clustering.kMeans(shortestPathWithoutOutliersReduced[:, 0:10], n_clusters)
    clustering.showPairPlot(wl5WithoutOutliersReduced[:, 0:2], wl5WithoutOutliersReduced_labels, 'Pairplot for WL5 without shared outliers')
    clustering.showPairPlot(graphletWithoutOutliersReduced[:, 0:2], graphletWithoutOutliersReduced_labels, 'Pairplot for Graphlet without shared outliers')
    clustering.showPairPlot(shortestPathWithoutOutliersReduced[:, 0:2], shortestPathWithoutOutliersReduced_labels, 'Pairplot for ShortestPath without shared outliers')
    print("The NMI between Wl5 and Graphlet is: ", normalized_mutual_info_score(wl5WithoutOutliersReduced_labels, graphletWithoutOutliersReduced_labels))
    print("The NMI between Wl5 and Shortest Path is: ", normalized_mutual_info_score(wl5WithoutOutliersReduced_labels, shortestPathWithoutOutliersReduced_labels))
    print("The NMI between Shortest Path and Graphlet is: ", normalized_mutual_info_score(shortestPathWithoutOutliersReduced_labels, graphletWithoutOutliersReduced_labels))
As you can see all the NMI scores increase significantly when increasing the number of clusters from 2 to 8. That means, that the found labels are better (more equal) if you analyze the dataset with more clusters.